Introduction to Clinical DataScience

Mortality prediction

We import some standard python data analysis packages and do authentication on Google Drive

In [ ]:
from __future__ import print_function
from google.colab import auth
from google.cloud import bigquery
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
auth.authenticate_user()
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm

Next we will need to enter some information on how to access the data.

analysis_project is the project used for processing the queries.

In [ ]:
#@title Fill out this form then press [shift ⇧]+[enter ⏎] {run: "auto"}
import subprocess
import re

analysis_project = 'physionet-data-275415'  #@param {type:"string"}

admissions_table = 'physionet-data.mimiciii_clinical.admissions'  # @param {type: "string"}
d_icd_diagnoses_table = 'physionet-data.mimiciii_clinical.d_icd_diagnoses'  # @param {type: "string"}
diagnoses_icd_table = 'physionet-data.mimiciii_clinical.diagnoses_icd'  # @param {type: "string"}
patients_table = 'physionet-data.mimiciii_clinical.patients'  # @param {type: "string"}
labs_event = 'physionet-data.mimiciii_clinical.labevents'  # @param {type: "string"}
labs_items = 'physionet-data.mimiciii_clinical.d_labitems'  # @param {type: "string"}

# Preprocess queries made with the %%bigquery magic
# by substituting these values
sub_dict = {
    'analysis_project': analysis_project,
    'admissions_table': admissions_table,
    'd_icd_diagnoses_table': d_icd_diagnoses_table,
    'diagnoses_icd_table': diagnoses_icd_table,
    'patients_table': patients_table,
    'ml_table_prefix': analysis_project + '.MIMIC.models_',
    'labs_event':labs_event,
    'labs_items':labs_items
}

# Set the default project for running queries
bigquery.magics.context.project = analysis_project

# Set up the substitution preprocessing injection
# if bigquery.magics._run_query.func_name != 'format_and_run_query':
#   original_run_query = bigquery.magics._run_query
original_run_query = bigquery.magics._run_query
def format_and_run_query(client, query, job_config=None):
  query = query.format(**sub_dict)
  return original_run_query(client, query, job_config)

bigquery.magics._run_query = format_and_run_query

print('analysis_project:', analysis_project)
print()
print('custom %%bigquery magic substitutions:')
for k, v in sub_dict.items():
  print(' ', '{%s}' % k, '→', v)

%config InlineBackend.figure_format = 'svg'
bq = bigquery.Client(project=analysis_project)  
analysis_project: physionet-data-275415

custom %%bigquery magic substitutions:
  {analysis_project} → physionet-data-275415
  {admissions_table} → physionet-data.mimiciii_clinical.admissions
  {d_icd_diagnoses_table} → physionet-data.mimiciii_clinical.d_icd_diagnoses
  {diagnoses_icd_table} → physionet-data.mimiciii_clinical.diagnoses_icd
  {patients_table} → physionet-data.mimiciii_clinical.patients
  {ml_table_prefix} → physionet-data-275415.MIMIC.models_
  {labs_event} → physionet-data.mimiciii_clinical.labevents
  {labs_items} → physionet-data.mimiciii_clinical.d_labitems

Create data set named MIMIC

In [ ]:
if 'MIMIC' not in [d.dataset_id for d in list(bq.list_datasets())]:
  dataset_id = "{}.MIMIC".format(bq.project)
  # Construct a full Dataset object to send to the API.
  # Send the dataset to the API for creation.
  # Raises google.api_core.exceptions.Conflict if the Dataset already
  # exists within the project.

  dataset = bigquery.Dataset(dataset_id)
  dataset = bq.create_dataset(dataset)  # Make an API request.

Classification model - 1 - by number of labs

In the intensive care unit, are more labs events or less likely to be fatal?

In [ ]:
%%bigquery hist_df
SELECT
  n_labs, COUNT(*) AS cnt
FROM (
  SELECT
    HADM_ID,COUNT(*) AS n_labs
  FROM
    `{labs_event}`
  WHERE HADM_ID IS NOT NULL
  GROUP BY
    HADM_ID
)
GROUP BY n_labs
ORDER BY n_labs
In [ ]:
plt.figure(figsize=(15,4))
g = sns.barplot(
    x=hist_df.n_labs, y=hist_df.cnt, color=sns.color_palette()[0])
for i, label in enumerate(g.get_xticklabels()):
  if i % 30 != 4 and i != 0:
    label.set_visible(False)
plt.xticks(rotation=90)  
plt.xlabel('Number of labs')
plt.ylabel('Count')  
plt.xlim(0,1000)
Out[ ]:
(0.0, 1000.0)

Zoom in

In [ ]:
plt.figure(figsize=(15,4))
g = sns.barplot(
    x=hist_df.n_labs, y=hist_df.cnt, color=sns.color_palette()[0])
for i, label in enumerate(g.get_xticklabels()):
  if i % 30 != 4 and i != 0:
    label.set_visible(False)
plt.xticks(rotation=90)  
plt.xlabel('Number of labs')
plt.ylabel('Count')  
plt.xlim(0,650)
Out[ ]:
(0.0, 650.0)

The mathematical explanation of this is called central limit theorem. While this is by no means a deal breaker, the thins tails we see in the distribution can be a challenge for linear-regression models. This is because the extreme points tend to affect the likelihood the most, so having fewer of them makes your model more sensitive to outliers. Regularization can help with this, but if it becomes too much of a problem we can consider a different type of model (such as support-vector machines, or robust regression) instead of generalized linear regression.

In [ ]:
%%bigquery
# BigQuery ML create model statement:
CREATE OR REPLACE MODEL `{ml_table_prefix}mortality_models_according_labs_events`

OPTIONS(
  # Use logistic_reg for discrete predictions (classification) and linear_reg
  # for continuous predictions (forecasting).
  model_type = 'logistic_reg',
  # See the below aside (𝜎 = 0.5 ⇒ 𝜆 = 2)
  l2_reg = 2,
  # Identify the column to use as the label (dependent variable)
  input_label_cols = ["died"]
)
AS
# standard SQL query to train the model with:
SELECT
  COUNT(*) AS event_labs,
  MAX(HOSPITAL_EXPIRE_FLAG) as died
FROM
  `{admissions_table}`
  INNER JOIN `{labs_event}`
  USING (HADM_ID)
GROUP BY HADM_ID
Out[ ]:
In [ ]:
%%bigquery scatter_df
SELECT
  COUNT(*) AS event_labs,
  MAX(HOSPITAL_EXPIRE_FLAG) as died
FROM
  `{admissions_table}`
  INNER JOIN `{labs_event}`
  USING (HADM_ID)  
GROUP BY HADM_ID

This is an example of the training table.

In [ ]:
scatter_df
Out[ ]:
event_labs died
0 284 0
1 496 0
2 2049 0
3 400 0
4 128 0
... ... ...
58146 29 1
58147 24 1
58148 10 1
58149 12 1
58150 10 1

58151 rows × 2 columns

Its time to check model weights

In [ ]:
%%bigquery event_labs_model_weights
SELECT * FROM ML.WEIGHTS(MODEL `{ml_table_prefix}mortality_models_according_labs_events`)
In [ ]:
event_labs_model_weights
Out[ ]:
processed_input weight category_weights
0 event_labs 0.000572 []
1 __INTERCEPT__ -2.392541 []

I see that event_labs has small weight and it is not enough to good predict

In [ ]:
params = {'max_prediction': hist_df.n_labs.max()}
In [ ]:
%%bigquery line_df --params $params
SELECT * FROM
ML.PREDICT(MODEL `{ml_table_prefix}complexity_mortality_models`, (
  SELECT * FROM
  UNNEST(GENERATE_ARRAY(1, @max_prediction)) AS event_labs
))

This is predict probabilities table.

In [ ]:
line_df
Out[ ]:
predicted_died predicted_died_probs event_labs
0 0 [{'label': 1, 'prob': 0.08378717442729142}, {'... 1
1 0 [{'label': 1, 'prob': 0.08383110096244409}, {'... 2
2 0 [{'label': 1, 'prob': 0.08387504841845957}, {'... 3
3 0 [{'label': 1, 'prob': 0.08391901680309094}, {'... 4
4 0 [{'label': 1, 'prob': 0.08396300612409194}, {'... 5
... ... ... ...
13708 1 [{'label': 1, 'prob': 0.9957219683148527}, {'l... 13709
13709 1 [{'label': 1, 'prob': 0.9957244044902884}, {'l... 13710
13710 1 [{'label': 1, 'prob': 0.9957268392843689}, {'l... 13711
13711 1 [{'label': 1, 'prob': 0.9957292726978708}, {'l... 13712
13712 1 [{'label': 1, 'prob': 0.9957317047315702}, {'l... 13713

13713 rows × 3 columns

In [ ]:
plt.figure(figsize=(10,5))
sns.regplot(
    x='event_labs',
    y='died',
    data=scatter_df,
    fit_reg=False,
    x_bins=np.arange(1,
                     scatter_df.event_labs.max() + 1)
    )

plt.plot(line_df.event_labs,
         line_df.predicted_died_probs.apply(lambda x: x[0]['prob']))

plt.xlabel('Number of labs')
plt.ylabel('Probability of death during admission')
plt.xticks(rotation=90)    
plt.xlim(0,4000)
Out[ ]:
(0.0, 4000.0)

Zoom in

In [ ]:
plt.figure(figsize=(10,5))
sns.regplot(
    x='event_labs',
    y='died',
    data=scatter_df,
    fit_reg=False,
    x_bins=np.arange(1,
                     scatter_df.event_labs.max() + 1)
    )

plt.plot(line_df.event_labs,
         line_df.predicted_died_probs.apply(lambda x: x[0]['prob']))

plt.xlabel('Number of labs')
plt.ylabel('Probability of death during admission')
plt.xticks(rotation=90)    
plt.xlim(0,500)
Out[ ]:
(0.0, 500.0)

Based on the graph, I believe that this model cannot provide us with good prediction accuracy. Therefore, I conclude that there is no relationship between the number of labs and the probability of death. Nevertheless, we will try to improve it

Classification model - 2 - by number of labs, gender, age, admission type, admission location, insurance, marital status, and ethnicity

In [ ]:
%%bigquery DF_TOLEARNING
SELECT 
  IF(DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25 < 200, DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25, 95) AS AGE,
  MARITAL_STATUS,
  GENDER,
  ADMISSION_TYPE,
  ADMISSION_LOCATION,
  INSURANCE,
  ETHNICITY,  
  num_labs as NUM_LABS,
  died
 
FROM
  (SELECT
    COUNT(*) AS num_labs,
    MAX(HOSPITAL_EXPIRE_FLAG) as died,
    ANY_VALUE(ADMITTIME) as ADMITTIME,
    ANY_VALUE(MARITAL_STATUS) as MARITAL_STATUS,
    ANY_VALUE(ADMISSION_TYPE) as ADMISSION_TYPE,	 	 
    ANY_VALUE(ADMISSION_LOCATION) as ADMISSION_LOCATION,
    ANY_VALUE(INSURANCE) as INSURANCE,
    ANY_VALUE(ETHNICITY) as ETHNICITY,
   SUBJECT_ID
  FROM
    `{admissions_table}` AS adm
    JOIN `{labs_event}` AS labs
  USING (HADM_ID, SUBJECT_ID)
  GROUP BY HADM_ID, SUBJECT_ID
  )
  JOIN `{patients_table}` AS patients
  USING (SUBJECT_ID)

Example table for train

In [ ]:
DF_TOLEARNING
Out[ ]:
AGE MARITAL_STATUS GENDER ADMISSION_TYPE ADMISSION_LOCATION INSURANCE ETHNICITY NUM_LABS died
0 65.232033 MARRIED F EMERGENCY TRANSFER FROM OTHER HEALT Medicare BLACK/AFRICAN AMERICAN 581 1
1 20.199863 SINGLE M ELECTIVE TRANSFER FROM SKILLED NUR Medicaid HISPANIC OR LATINO 2893 0
2 29.144422 SINGLE F ELECTIVE TRANSFER FROM SKILLED NUR Medicare UNKNOWN/NOT SPECIFIED 4041 0
3 80.501027 MARRIED M ELECTIVE TRANSFER FROM OTHER HEALT Medicare WHITE 6873 1
4 66.272416 MARRIED F URGENT TRANSFER FROM SKILLED NUR Private WHITE 1134 1
... ... ... ... ... ... ... ... ... ...
58146 0.172485 None F EMERGENCY TRANSFER FROM HOSP/EXTRAM Private WHITE 2 0
58147 0.016427 None F EMERGENCY TRANSFER FROM HOSP/EXTRAM Private WHITE 6 0
58148 0.005476 None F EMERGENCY TRANSFER FROM HOSP/EXTRAM Private WHITE 3 0
58149 0.002738 None F EMERGENCY TRANSFER FROM HOSP/EXTRAM Medicaid UNKNOWN/NOT SPECIFIED 9 0
58150 0.008214 None F EMERGENCY TRANSFER FROM HOSP/EXTRAM Private WHITE 3 0

58151 rows × 9 columns

Time to train our model

In [ ]:
%%bigquery
CREATE OR REPLACE MODEL `{ml_table_prefix}complexity_feature_mortality`
OPTIONS(model_type='logistic_reg', l2_reg=2, input_label_cols=["died"])
AS
SELECT 
  IF(DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25 < 200, DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25, 95) AS age,
  MARITAL_STATUS,
  GENDER,
  ADMISSION_TYPE,
  ADMISSION_LOCATION,
  INSURANCE,
  ETHNICITY,  
  num_labs,
  died
 
FROM
  (SELECT
    COUNT(*) AS num_labs,
    MAX(HOSPITAL_EXPIRE_FLAG) as died,
    ANY_VALUE(ADMITTIME) as ADMITTIME,
    ANY_VALUE(MARITAL_STATUS) as MARITAL_STATUS,
    ANY_VALUE(ADMISSION_TYPE) as ADMISSION_TYPE,	 	 
    ANY_VALUE(ADMISSION_LOCATION) as ADMISSION_LOCATION,
    ANY_VALUE(INSURANCE) as INSURANCE,
    ANY_VALUE(ETHNICITY) as ETHNICITY,
   SUBJECT_ID
  FROM
    `{admissions_table}` AS adm
    JOIN `{labs_event}` AS labs
  USING (HADM_ID, SUBJECT_ID)
  GROUP BY HADM_ID, SUBJECT_ID
  )
  JOIN `{patients_table}` AS patients
  USING (SUBJECT_ID)
Out[ ]:
In [ ]:
%%bigquery weight_all
SELECT * FROM ML.WEIGHTS(MODEL `{ml_table_prefix}complexity_feature_mortality`)
ORDER BY weight DESC

Table with model weights

In [ ]:
weight_all
Out[ ]:
processed_input weight category_weights
0 age 0.008593 []
1 num_labs 0.000508 []
2 __INTERCEPT__ -0.943784 []
3 MARITAL_STATUS NaN [{'category': 'DIVORCED', 'weight': -0.3697188...
4 GENDER NaN [{'category': 'M', 'weight': -0.35951872837762...
5 ADMISSION_TYPE NaN [{'category': 'URGENT', 'weight': -0.228855164...
6 ADMISSION_LOCATION NaN [{'category': 'EMERGENCY ROOM ADMIT', 'weight'...
7 INSURANCE NaN [{'category': 'Government', 'weight': -0.55271...
8 ETHNICITY NaN [{'category': 'NATIVE HAWAIIAN OR OTHER PACIFI...

We see that num_labs still has a small weight. Age has bigger weight, but it seems to me that it is still not big enough. We open the weight in categorical variables

In [ ]:
title=weight_all['processed_input'][3:].to_list()
s=0
for i in weight_all['category_weights']:

  if len(i)>1:
    print(title[s])
    for x in i:
      print(x['category'],x['weight'])
  else:
    continue
  s+=1
  print("____________\n")    
MARITAL_STATUS
DIVORCED -0.36971886739184484
SINGLE -0.445495819534432
MARRIED -0.3304742025429302
UNKNOWN (DEFAULT) 0.08597386089763329
SEPARATED -0.3690825484020848
LIFE PARTNER -0.5261539327522753
WIDOWED -0.17327175054332558
_null_filler -0.3517818687084074
____________

GENDER
M -0.3595187283776261
F -0.31851181754370717
____________

ADMISSION_TYPE
URGENT -0.22885516490170754
NEWBORN -0.6871054040246538
ELECTIVE -0.71934737651752
EMERGENCY -0.21293061742995
____________

ADMISSION_LOCATION
EMERGENCY ROOM ADMIT -0.16722179783434138
TRANSFER FROM OTHER HEALT 0.39722388484428356
TRANSFER FROM SKILLED NUR 0.06209344808577427
HMO REFERRAL/SICK -0.4079910799703707
TRSF WITHIN THIS FACILITY -0.20629229712599448
PHYS REFERRAL/NORMAL DELI -0.6596018772416767
CLINIC REFERRAL/PREMATURE -0.3672665729480982
TRANSFER FROM HOSP/EXTRAM -0.225608350937147
** INFO NOT AVAILABLE ** -0.5647350063251118
____________

INSURANCE
Government -0.552715337942527
Medicaid -0.46657695207783545
Medicare -0.1894359145499884
Self Pay 0.02637404097157134
Private -0.49374556300729744
____________

ETHNICITY
NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER 0.46805044931186823
HISPANIC/LATINO - PUERTO RICAN -0.570713394339319
BLACK/AFRICAN -0.3345189377760417
ASIAN - FILIPINO -0.4290752320459104
BLACK/HAITIAN -0.5487158425811614
WHITE - BRAZILIAN -0.6772779207839726
AMERICAN INDIAN/ALASKA NATIVE -0.29452078434396933
HISPANIC/LATINO - CUBAN -0.636342687843889
BLACK/AFRICAN AMERICAN -0.4850944398733596
WHITE - RUSSIAN -0.0583259861897774
CARIBBEAN ISLAND -0.20420885962001448
ASIAN - CHINESE -0.2284396582792536
UNABLE TO OBTAIN 0.18037080267950992
HISPANIC/LATINO - DOMINICAN -0.5649985934790349
ASIAN - OTHER -0.12870312645205434
SOUTH AMERICAN -0.3285198088346844
ASIAN - JAPANESE 0.473355420475765
OTHER -0.3862312572934892
PORTUGUESE -0.6084682429441823
WHITE - OTHER EUROPEAN -0.4896064307089073
HISPANIC OR LATINO -0.51473919006077
PATIENT DECLINED TO ANSWER -0.39176849653000806
ASIAN - CAMBODIAN 0.5209650871128666
ASIAN - THAI -0.1424943254913121
MULTI RACE ETHNICITY -0.40457786272750096
HISPANIC/LATINO - CENTRAL AMERICAN (OTHER) -0.36227071710490305
HISPANIC/LATINO - HONDURAN -0.22558533104513584
MIDDLE EASTERN -0.458296394645372
HISPANIC/LATINO - GUATEMALAN -0.5747400390494883
HISPANIC/LATINO - COLOMBIAN -0.3281505597854003
HISPANIC/LATINO - MEXICAN -0.3963811840163905
ASIAN -0.358280595012094
ASIAN - ASIAN INDIAN -0.5846269470546474
ASIAN - VIETNAMESE 0.06656378665968636
WHITE -0.3565253321501619
BLACK/CAPE VERDEAN -0.5073967748635091
HISPANIC/LATINO - SALVADORAN -0.5468487002947191
ASIAN - KOREAN -0.08691947117308549
UNKNOWN/NOT SPECIFIED 0.032426649758746956
AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE 0.17419334792125096
WHITE - EASTERN EUROPEAN -0.1968073990865628
____________

We see that the weights of the new variables are greater than the weight of the number of laboratory tests. This means that the new variables are more influence on the accuracy of the predict.

In [ ]:
def set_precision(df):
  df['precision'] = df.true_positives / (df.true_positives + df.false_positives)

def plot_precision_recall(df, label=None):
  # manually add the threshold = -∞ point
  df = df[df.true_positives != 0]
  recall = [0] + list(df.recall)
  precision = [1] + list(df.precision)
  # x=recall, y=precision line chart
  plt.plot(recall, precision, label=label)  
In [ ]:
%%bigquery model1
SELECT * FROM ML.ROC_CURVE(MODEL `{ml_table_prefix}mortality_models_according_labs_events`)
In [ ]:
%%bigquery model2
SELECT * FROM ML.ROC_CURVE(MODEL `{ml_table_prefix}complexity_feature_mortality`)
In [ ]:
eval_queries = list()
for m in ['mortality_models_according_labs_events','complexity_feature_mortality']:
  eval_queries.append(
      'SELECT * FROM ML.EVALUATE('
      'MODEL `{ml_table_prefix}{m}`)'
      .format(m=m, **sub_dict))
eval_query = '\nUNION ALL\n'.join(eval_queries)
d=bq.query(eval_query).result().to_dataframe()
d.index=['models_according_labs_events','models_according_labs_events+features']
d
Out[ ]:
precision recall accuracy f1_score log_loss roc_auc
models_according_labs_events 0.333333 0.005831 0.895581 0.011461 0.323940 0.651109
models_according_labs_events+features 0.611111 0.010000 0.892105 0.019678 0.307957 0.742379
In [ ]:
set_precision(model1)
set_precision(model2)
plot_precision_recall(model1, label='only_labs')
plot_precision_recall(model2, label='all_features')
# plt.plot(
#     np.linspace(0, 1, 2), [comp_roc.precision.min()] * 2,
#     label='null model',
#     linestyle='--')
plt.legend()
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel(r'Recall $\left(\frac{T_p}{T_p + F_n} \right)$')
plt.ylabel(r'Precision $\left(\frac{T_p}{T_p + F_p} \right)$')
Out[ ]:
Text(0, 0.5, 'Precision $\\left(\\frac{T_p}{T_p + F_p} \\right)$')

From the ROC curves we see that adding the variables regarding a patient information and their admission improves the model.

Of course, neither of these models is very good when it comes to making predictions. For our last set of models, we'll try more earnestly to predict patient mortality.

Classification model - 3 - by number of labs, gender, age, admission type, admission location, insurance, marital status, and ethnicity and one hot encoding for 𝑚 most common labs

Where 𝑚∈ {8,16,32,64,128,256,512}

Firstly, make up the top lab events

In [ ]:
%%bigquery top_labs
SELECT ITEMID,COUNT,LABEL
FROM
(SELECT ITEMID,COUNT(*) AS COUNT
 FROM `{labs_event}`
 GROUP BY (ITEMID))
 JOIN `{labs_items}`
 USING(ITEMID)
 ORDER BY COUNT DESC
In [ ]:
top_labs
Out[ ]:
ITEMID COUNT LABEL
0 51221 881764 Hematocrit
1 50971 845737 Potassium
2 50983 808401 Sodium
3 50912 797389 Creatinine
4 50902 795480 Chloride
... ... ... ...
721 51126 1 Promyelocytes
722 51055 1 Lipase, Pleural
723 51531 1 STDYURINE
724 51378 1 Metamyelocytes
725 51535 1 CD55

726 rows × 3 columns

Example table for train

In [ ]:
%%bigquery
WITH labs AS (
    SELECT
      HADM_ID,
      COUNT(*) AS COUNT_LABS,
      MAX(IF(ITEMID = 51221, 1.0, 0.0)) as `lab_51221`,
    MAX(IF(ITEMID = 50971, 1.0, 0.0)) as `lab_50971`,
    MAX(IF(ITEMID = 50983, 1.0, 0.0)) as `lab_50983`,
    MAX(IF(ITEMID = 50912, 1.0, 0.0)) as `lab_50912`,
    MAX(IF(ITEMID = 50902, 1.0, 0.0)) as `lab_50902`,
    MAX(IF(ITEMID = 51006, 1.0, 0.0)) as `lab_51006`,
    MAX(IF(ITEMID = 50882, 1.0, 0.0)) as `lab_50882`,
    MAX(IF(ITEMID = 51265, 1.0, 0.0)) as `lab_51265`,
    MAX(IF(ITEMID = 50868, 1.0, 0.0)) as `lab_50868`,
    MAX(IF(ITEMID = 51301, 1.0, 0.0)) as `lab_51301`,
    MAX(IF(ITEMID = 51222, 1.0, 0.0)) as `lab_51222`,
    MAX(IF(ITEMID = 50931, 1.0, 0.0)) as `lab_50931`,
    MAX(IF(ITEMID = 51249, 1.0, 0.0)) as `lab_51249`,
    MAX(IF(ITEMID = 51279, 1.0, 0.0)) as `lab_51279`,
    MAX(IF(ITEMID = 51248, 1.0, 0.0)) as `lab_51248`,
    MAX(IF(ITEMID = 51250, 1.0, 0.0)) as `lab_51250`
    FROM `physionet-data.mimiciii_clinical.labevents`
    GROUP BY HADM_ID
    )

    SELECT 
    * EXCEPT (SUBJECT_ID,HADM_ID,ITEMID,ADMITTIME,DOB),
    IF(DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25 < 200,
       DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25, 95) AS age,
    FROM 
    (SELECT SUBJECT_ID,HADM_ID,ITEMID FROM `physionet-data.mimiciii_clinical.labevents`) 
    JOIN
    (SELECT SUBJECT_ID,HADM_ID,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,MARITAL_STATUS,ETHNICITY ,ADMITTIME,HOSPITAL_EXPIRE_FLAG AS died FROM `physionet-data.mimiciii_clinical.admissions`)
    USING (SUBJECT_ID,HADM_ID)
    JOIN
    (SELECT DOB,GENDER,SUBJECT_ID FROM`physionet-data.mimiciii_clinical.patients`)
    USING (SUBJECT_ID)
    JOIN labs
    USING (HADM_ID) 
    limit 100
Out[ ]:
ADMISSION_TYPE ADMISSION_LOCATION INSURANCE MARITAL_STATUS ETHNICITY died GENDER COUNT_LABS lab_51221 lab_50971 lab_50983 lab_50912 lab_50902 lab_51006 lab_50882 lab_51265 lab_50868 lab_51301 lab_51222 lab_50931 lab_51249 lab_51279 lab_51248 lab_51250 age
0 EMERGENCY EMERGENCY ROOM ADMIT Medicare MARRIED WHITE 0 M 1060 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 76.522930
1 EMERGENCY EMERGENCY ROOM ADMIT Private SINGLE WHITE 0 F 245 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 47.843943
2 EMERGENCY EMERGENCY ROOM ADMIT Medicaid None UNKNOWN/NOT SPECIFIED 1 M 453 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 41.787817
3 EMERGENCY EMERGENCY ROOM ADMIT Private MARRIED WHITE 0 F 442 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 50.146475
4 EMERGENCY EMERGENCY ROOM ADMIT Private MARRIED WHITE 0 M 457 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 32.766598
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
95 EMERGENCY EMERGENCY ROOM ADMIT Private MARRIED WHITE 0 M 562 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 64.835044
96 EMERGENCY EMERGENCY ROOM ADMIT Medicare WIDOWED UNKNOWN/NOT SPECIFIED 1 F 1334 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 83.619439
97 EMERGENCY EMERGENCY ROOM ADMIT Medicare SINGLE HISPANIC OR LATINO 0 M 347 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 55.310062
98 EMERGENCY TRANSFER FROM HOSP/EXTRAM Medicare WIDOWED WHITE 0 F 507 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 79.452430
99 NEWBORN CLINIC REFERRAL/PREMATURE Private None WHITE 0 M 221 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.000000

100 rows × 25 columns

Time to train model

This time around we're using l1_reg instead of l2_reg because we expect that some of our some of our many variables will not significantly impact the outcome, and we would prefer a sparse model if possible.

In [ ]:
top_m_labs = (8, 16, 32, 64, 128, 256, 512)
query_jobs = list()
for m in top_m_labs:
  labs_columns = list()
  for _, row in top_labs.iloc[:m].iterrows():
    labs_columns.append('MAX(IF(ITEMID = {0}, 1.0, 0.0))'
                             ' as `lab_{0}`'.format(row.ITEMID))
  query = """
    CREATE OR REPLACE MODEL `{ml_table_prefix}mortality_feature_top_labs_{m}`
  OPTIONS(model_type='logistic_reg', l1_reg=2, input_label_cols=["died"])
  AS
    WITH labs AS (
    SELECT
      HADM_ID,
      COUNT(*) AS COUNT_LABS,
      {labs_columns}
    FROM `{labs_event}`
    GROUP BY HADM_ID
    )

    SELECT 
    * EXCEPT (SUBJECT_ID,HADM_ID,ITEMID,ADMITTIME,DOB),
    IF(DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25 < 200,
       DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25, 95) AS age,
    FROM 
    (SELECT SUBJECT_ID,HADM_ID,ITEMID FROM `{labs_event}`) 
    JOIN
    (SELECT SUBJECT_ID,HADM_ID,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,MARITAL_STATUS,ETHNICITY ,ADMITTIME,HOSPITAL_EXPIRE_FLAG AS died FROM `{admissions_table}`)
    USING (SUBJECT_ID,HADM_ID)
    JOIN
    (SELECT DOB,GENDER,SUBJECT_ID FROM`{patients_table}`)
    USING (SUBJECT_ID)
    JOIN labs
    USING (HADM_ID)
  """.format(m=m,labs_columns=',\n    '.join(labs_columns),**sub_dict)
  query_jobs.append(bq.query(query))
for j in query_jobs:
  j.exception()

Time to plot ROC curve

In [ ]:
eval_queries = list()
for m in top_m_labs:
  eval_queries.append(
      'SELECT * FROM ML.EVALUATE('
      'MODEL `{ml_table_prefix}mortality_feature_top_labs_{}`)'
      .format(m, **sub_dict))
eval_query = '\nUNION ALL\n'.join(eval_queries)
bq.query(eval_query).result().to_dataframe()
Out[ ]:
precision recall accuracy f1_score log_loss roc_auc
0 0.476190 0.023697 0.832209 0.045147 0.416909 0.709716
1 0.538462 0.029167 0.834684 0.055336 0.415616 0.706230
2 0.503876 0.038622 0.830921 0.071744 0.406525 0.729181
3 0.875000 0.004217 0.831808 0.008393 0.394901 0.760225
4 0.766423 0.060624 0.833099 0.112360 0.375335 0.809382
5 0.661224 0.285546 0.854879 0.398851 0.345021 0.832355
6 0.729792 0.182659 0.848551 0.292187 0.359443 0.825181

Plotting ROC and precision-recall curves

In [ ]:
for m in top_m_labs:
  df = bq.query('SELECT * FROM ML.ROC_CURVE('
                'MODEL `{ml_table_prefix}mortality_feature_top_labs_{}`)'
                .format(m, **sub_dict)).result().to_dataframe()
  set_precision(df)
  plot_precision_recall(df, label='{} labs'.format(m))

# plt.plot(
#     np.linspace(0, 1, 2), [df.precision.min()] * 2,
#     label='null model',
#     linestyle='--')
plt.legend()
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel(r'Recall $\left(\frac{T_p}{T_p + F_n} \right)$')
plt.ylabel(r'Precision $\left(\frac{T_p}{T_p + F_p} \right)$')
Out[ ]:
Text(0, 0.5, 'Precision $\\left(\\frac{T_p}{T_p + F_p} \\right)$')

The model with m = 512 seems to be overfitting the data, while somewhere between m = 128 and m = 256 seems to be the sweet spot for model flexibility.

Actually, the predictive power of our model¹ isn't nearly as interesting as it's weights and what they tell us. In the next section, we'll dig into them.

Let's have a look at the weights from the m = 128 model.

In [ ]:
%%bigquery weights_128
SELECT * FROM ML.WEIGHTS(MODEL `{ml_table_prefix}mortality_feature_top_labs_128`)
ORDER BY weight DESC
In [ ]:
weights_128
Out[ ]:
processed_input weight category_weights
0 lab_51257 0.255663 []
1 lab_50909 0.206321 []
2 lab_51218 0.204007 []
3 lab_50908 0.154342 []
4 lab_50825 0.145432 []
... ... ... ...
132 ADMISSION_LOCATION NaN [{'category': 'PHYS REFERRAL/NORMAL DELI', 'we...
133 INSURANCE NaN [{'category': 'Private', 'weight': -0.33721908...
134 MARITAL_STATUS NaN [{'category': '_null_filler', 'weight': -0.135...
135 ETHNICITY NaN [{'category': 'HISPANIC/LATINO - HONDURAN', 'w...
136 GENDER NaN [{'category': 'M', 'weight': -0.26831609486153...

137 rows × 3 columns

First we'll look at the weights for the numerical inputs.

In [ ]:
pd.options.display.max_rows=500
weights_128['ITEMID'] = weights_128.processed_input \
  .apply(lambda x: int(x.split('_')[1]) if x.startswith('lab_') else x)
weights_128  
view_df = weights_128.merge(top_labs,how='left', on='ITEMID') 
view_df = view_df[~pd.isnull(view_df.weight)]
view_df[['processed_input', 'LABEL', 'weight', 'COUNT']]
Out[ ]:
processed_input LABEL weight COUNT
0 lab_51257 Nucleated Red Cells 0.255663 16568.0
1 lab_50909 Cortisol 0.206321 13554.0
2 lab_51218 Granulocyte Count 0.204007 24375.0
3 lab_50908 CK-MB Index 0.154342 22911.0
4 lab_50825 Temperature 0.145432 129444.0
5 lab_50801 Alveolar-arterial Gradient 0.143498 22016.0
6 lab_50819 PEEP 0.142498 86911.0
7 lab_50823 Required O2 0.141914 22057.0
8 lab_50813 Lactate 0.137076 187116.0
9 lab_51214 Fibrinogen, Functional 0.134105 45935.0
10 lab_50826 Tidal Volume 0.118476 83805.0
11 lab_51104 Urea Nitrogen, Urine 0.115136 13294.0
12 lab_51275 PTT 0.110955 474930.0
13 lab_50802 Base Excess 0.110690 490651.0
14 lab_51237 INR(PT) 0.110259 471176.0
15 lab_51274 PT 0.110242 469083.0
16 lab_50804 Calculated Total CO2 0.110219 490641.0
17 lab_50818 pCO2 0.109548 490594.0
18 lab_50821 pO2 0.109307 490628.0
19 lab_50820 pH 0.104140 530752.0
20 lab_51007 Uric Acid 0.099312 18355.0
21 lab_50816 Oxygen 0.099268 127960.0
22 lab_50883 Bilirubin, Direct 0.095494 45205.0
23 lab_50817 Oxygen Saturation 0.092758 173418.0
24 lab_50954 Lactate Dehydrogenase (LD) 0.091158 107120.0
25 lab_51482 Hyaline Casts 0.090868 15118.0
26 lab_50931 Glucose 0.090173 748896.0
27 lab_51093 Osmolality, Urine 0.088802 18838.0
28 lab_50912 Creatinine 0.088614 797389.0
29 lab_51006 Urea Nitrogen 0.087416 791838.0
30 lab_50884 Bilirubin, Indirect 0.086861 44385.0
31 lab_50935 Haptoglobin 0.086001 11970.0
32 lab_50808 Free Calcium 0.085903 249110.0
33 lab_51082 Creatinine, Urine 0.083197 33062.0
34 lab_50910 Creatine Kinase (CK) 0.083075 132400.0
35 lab_51144 Bands 0.078841 75587.0
36 lab_50911 Creatine Kinase, MB Isoenzyme 0.078059 115802.0
37 lab_51003 Troponin T 0.077059 87238.0
38 lab_50862 Albumin 0.075894 146694.0
39 lab_51143 Atypical Lymphocytes 0.074792 59334.0
40 lab_51100 Sodium, Urine 0.073687 25990.0
41 lab_51251 Metamyelocytes 0.073105 59206.0
42 lab_51255 Myelocytes 0.072468 59158.0
43 lab_51009 Vancomycin 0.071942 54728.0
44 lab_51260 Ovalocytes 0.067751 16180.0
45 lab_50967 Phenytoin 0.066094 22453.0
46 lab_50878 Asparate Aminotransferase (AST) 0.054382 219452.0
47 lab_50861 Alanine Aminotransferase (ALT) 0.053886 219462.0
48 lab_50964 Osmolality, Measured 0.050698 18816.0
49 lab_50956 Lipase 0.042474 65373.0
50 lab_50885 Bilirubin, Total 0.039935 238263.0
51 lab_50827 Ventilation Rate 0.038212 77611.0
52 lab_51266 Platelet Smear 0.037483 44151.0
53 lab_50863 Alkaline Phosphatase 0.035616 207847.0
54 lab_51200 Eosinophils 0.032887 172126.0
55 lab_51146 Basophils 0.032887 172124.0
56 lab_51244 Lymphocytes 0.032880 172131.0
57 lab_51256 Neutrophils 0.032880 172131.0
58 lab_51254 Monocytes 0.032880 172130.0
59 lab_50960 Magnesium 0.029979 664123.0
60 lab_51000 Triglycerides 0.020507 24987.0
61 lab_50822 Potassium, Whole Blood 0.017437 192949.0
62 lab_50867 Amylase 0.015652 63664.0
63 lab_51476 Epithelial Cells 0.015240 83200.0
64 lab_51493 RBC 0.012671 84278.0
65 lab_51516 WBC 0.009970 84682.0
66 lab_51268 Polychromasia 0.006576 47280.0
67 lab_51519 Yeast 0.006532 85457.0
68 age NaN 0.005401 NaN
69 lab_50809 Glucose 0.002947 196736.0
70 lab_50933 Green Top Hold (plasma) 0.002788 26855.0
71 lab_51267 Poikilocytosis 0.001837 55886.0
72 lab_51137 Anisocytosis 0.001609 61381.0
73 lab_51246 Macrocytes 0.001010 60837.0
74 COUNT_LABS NaN 0.000007 NaN
75 lab_50812 Intubated -0.001601 163680.0
76 lab_51233 Hypochromia -0.002013 61634.0
77 lab_51492 Protein -0.003263 106412.0
78 lab_51252 Microcytes -0.004650 58059.0
79 lab_50887 Blue Top Hold -0.007119 21880.0
80 lab_51087 Length of Urine Collection -0.009542 46834.0
81 lab_51508 Urine Color -0.009823 100377.0
82 lab_50970 Phosphate -0.010465 590502.0
83 lab_50815 O2 Flow -0.012517 12307.0
84 lab_50800 SPECIMEN TYPE -0.013903 404785.0
85 lab_50893 Calcium, Total -0.018512 591932.0
86 lab_50920 Estimated GFR (MDRD equation) -0.028731 63253.0
87 lab_51478 Glucose -0.030781 101280.0
88 lab_51463 Bacteria -0.031828 73412.0
89 lab_51514 Urobilinogen -0.032451 101817.0
90 lab_50979 Red Top Hold -0.033283 23831.0
91 lab_51506 Urine Appearance -0.035610 99838.0
92 lab_50993 Thyroid Stimulating Hormone -0.035625 28220.0
93 lab_50828 Ventilator -0.040424 60544.0
94 lab_50806 Chloride, Whole Blood -0.041211 48188.0
95 lab_51512 Urine Mucous -0.041344 14204.0
96 lab_51486 Leukocytes -0.041521 97625.0
97 lab_51464 Bilirubin -0.043050 99680.0
98 lab_51487 Nitrite -0.043508 99675.0
99 lab_51466 Blood -0.044745 99726.0
100 lab_51498 Specific Gravity -0.045598 115858.0
101 lab_51491 pH -0.046190 116081.0
102 lab_51484 Ketone -0.046667 101624.0
103 lab_50905 Cholesterol, LDL, Calculated -0.047695 16486.0
104 lab_50824 Sodium, Whole Blood -0.055432 71504.0
105 lab_50810 Hematocrit, Calculated -0.055950 89715.0
106 lab_50811 Hemoglobin -0.055950 89712.0
107 lab_50907 Cholesterol, Total -0.059530 19444.0
108 lab_50904 Cholesterol, HDL -0.059995 18260.0
109 lab_50903 Cholesterol Ratio (Total/HDL) -0.061777 18128.0
110 lab_50902 Chloride -0.063975 795480.0
111 lab_50971 Potassium -0.066197 845737.0
112 lab_50983 Sodium -0.066981 808401.0
113 lab_50882 Bicarbonate -0.067111 780648.0
114 lab_50868 Anion Gap -0.076602 769810.0
115 lab_50952 Iron -0.076735 18205.0
116 lab_50998 Transferrin -0.089251 17283.0
117 lab_50924 Ferritin -0.090016 17946.0
118 lab_50953 Iron Binding Capacity, Total -0.091066 17229.0
119 lab_50986 tacroFK -0.126208 21630.0
120 lab_51301 White Blood Cells -0.155715 753221.0
121 lab_50852 % Hemoglobin A1c -0.156919 16618.0
122 lab_51277 RDW -0.163794 746817.0
123 lab_51279 Red Blood Cells -0.179274 747999.0
124 lab_51248 MCH -0.179274 747994.0
125 lab_51250 MCV -0.179274 747977.0
126 lab_51249 MCHC -0.181280 748147.0
127 lab_51265 Platelet Count -0.181576 778365.0
128 lab_51222 Hemoglobin -0.184594 752444.0
129 lab_51221 Hematocrit -0.260464 881764.0
130 __INTERCEPT__ NaN -0.886281 NaN

We see have a list of lab events, sorted from most fatal to least fatal according to our model.

Going back to our original question, we can see that the weight for COUNT_LABS (a.k.a the number of lab events) has essentially gone to zero (0.000007). The average lab events weight is also very small:

In [ ]:
view_df[~pd.isnull(view_df.LABEL)].weight.mean()
Out[ ]:
0.014106780876409816

So we can conclude that given that a patient has been admitted to the ICU, the number of lab events they've been given does not predict their outcome beyond the linear effect of the component diagnoses.

Let's try to improve our model by adding a diagnosis

Classification model - 4 - by number of labs, gender, age, admission type, admission location, insurance, marital status, and ethnicity and one hot encoding for the most common 128 lab events and the most common 256 diagnoses

Where 𝑚∈ {8,16,32,64,128,256,512}

Top 256 diagnoses

In [ ]:
%%bigquery top_diagnoses
WITH top_diag AS (
  SELECT COUNT(*) AS count, ICD9_CODE FROM `{diagnoses_icd_table}`
  GROUP BY ICD9_CODE
)
SELECT top_diag.ICD9_CODE, icd_lookup.SHORT_TITLE, top_diag.count FROM
top_diag JOIN
 `{d_icd_diagnoses_table}` AS icd_lookup
USING (ICD9_CODE)
ORDER BY count DESC 
In [ ]:
top_diagnoses.head()
Out[ ]:
ICD9_CODE SHORT_TITLE count
0 4019 Hypertension NOS 20703
1 4280 CHF NOS 13111
2 42731 Atrial fibrillation 12891
3 41401 Crnry athrscl natve vssl 12429
4 5849 Acute kidney failure NOS 9119

To compile example table for train take a lot of time

In [ ]:
# top_m_labs = 128
# query_jobs = list()
# top_n_diagnoses = 256
# diagnosis_columns = list()
# labs_columns = list()
# for _, row in top_labs.iloc[:top_m_labs].iterrows():
#   labs_columns.append('MAX(IF(ITEMID = {0}, 1.0, 0.0))'
#                              ' as `lab_{0}`'.format(row.ITEMID))
# for _, row in top_diagnoses.iloc[:top_n_diagnoses].iterrows():
#   diagnosis_columns.append('MAX(IF(ICD9_CODE = "{0}", 1.0, 0.0))'
#                              ' as `icd9_{0}`'.format(row.ICD9_CODE))  
#   query = """
#         WITH labs AS (
#     SELECT
#       HADM_ID,
#       COUNT(*) AS COUNT_LABS,
#       {labs_columns}
#     FROM `{labs_event}`
#     GROUP BY HADM_ID
#     )
#     SELECT * EXCEPT (HADM_ID) FROM
#     (SELECT 
#     * EXCEPT (SUBJECT_ID,ITEMID,ADMITTIME,DOB),
#     IF(DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25 < 200,
#        DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25, 95) AS age,
#     FROM 
#     (SELECT SUBJECT_ID,HADM_ID,ITEMID FROM `{labs_event}`) 
#     JOIN
#     (SELECT SUBJECT_ID,HADM_ID,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,MARITAL_STATUS,ETHNICITY ,ADMITTIME,HOSPITAL_EXPIRE_FLAG AS died FROM `{admissions_table}`)
#     USING (SUBJECT_ID,HADM_ID)
#     JOIN
#     (SELECT DOB,GENDER,SUBJECT_ID FROM`{patients_table}`)
#     USING (SUBJECT_ID)
#     JOIN labs
#     USING (HADM_ID)
#     )
#     JOIN
#     (SELECT
#       HADM_ID,
#       COUNT(*) AS num_diag,
#       {diag_cols}
#     FROM `{diagnoses_icd_table}`
#     WHERE ICD9_CODE IS NOT NULL
#     GROUP BY HADM_ID)
#     USING (HADM_ID)
#   """.format(m=m,labs_columns=',\n    '.join(labs_columns),diag_cols=',\n    '.join(diagnosis_columns),**sub_dict)
# ex.result().to_dataframe().head()

Lets go right to train model

In [ ]:
top_m_labs = 128
query_jobs = list()
top_n_diagnoses = 256
diagnosis_columns = list()
labs_columns = list()
for _, row in top_labs.iloc[:top_m_labs].iterrows():
  labs_columns.append('MAX(IF(ITEMID = {0}, 1.0, 0.0))'
                             ' as `lab_{0}`'.format(row.ITEMID))
for _, row in top_diagnoses.iloc[:top_n_diagnoses].iterrows():
  diagnosis_columns.append('MAX(IF(ICD9_CODE = "{0}", 1.0, 0.0))'
                             ' as `icd9_{0}`'.format(row.ICD9_CODE))  
  query = """
  CREATE OR REPLACE MODEL `{ml_table_prefix}predict_mortality_diag_labs`
  OPTIONS(model_type='logistic_reg', l1_reg=2, input_label_cols=["died"])
  AS
        WITH labs AS (
    SELECT
      HADM_ID,
      COUNT(*) AS COUNT_LABS,
      {labs_columns}
    FROM `{labs_event}`
    GROUP BY HADM_ID
    )
    SELECT * EXCEPT (HADM_ID) FROM
    (SELECT 
    * EXCEPT (SUBJECT_ID,ITEMID,ADMITTIME,DOB),
    IF(DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25 < 200,
       DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25, 95) AS age,
    FROM 
    (SELECT SUBJECT_ID,HADM_ID,ITEMID FROM `{labs_event}`) 
    JOIN
    (SELECT SUBJECT_ID,HADM_ID,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,MARITAL_STATUS,ETHNICITY ,ADMITTIME,HOSPITAL_EXPIRE_FLAG AS died FROM `{admissions_table}`)
    USING (SUBJECT_ID,HADM_ID)
    JOIN
    (SELECT DOB,GENDER,SUBJECT_ID FROM`{patients_table}`)
    USING (SUBJECT_ID)
    JOIN labs
    USING (HADM_ID)
    )
    JOIN
    (SELECT
      HADM_ID,
      COUNT(*) AS num_diag,
      {diag_cols}
    FROM `{diagnoses_icd_table}`
    WHERE ICD9_CODE IS NOT NULL
    GROUP BY HADM_ID)
    USING (HADM_ID)
  """.format(labs_columns=',\n    '.join(labs_columns),diag_cols=',\n    '.join(diagnosis_columns),**sub_dict)
query_jobs.append(bq.query(query))

for j in query_jobs:
  j.exception()

Evaluation

In [ ]:
eval_queries = list()
eval_queries.append(
      'SELECT * FROM ML.EVALUATE('
      'MODEL `{ml_table_prefix}predict_mortality_diag_labs`)'
      .format( **sub_dict))
eval_query = '\nUNION ALL\n'.join(eval_queries)
bq.query(eval_query).result().to_dataframe()
Out[ ]:
precision recall accuracy f1_score log_loss roc_auc
0 0.7 0.323674 0.867588 0.442664 0.306609 0.880394

Plot precision recall curve to compare model with 128 Top Lab events+256 Top Diagnoses to model with 128 Top Lab events

In [ ]:
df1 = bq.query('SELECT * FROM ML.ROC_CURVE('
                'MODEL `{ml_table_prefix}predict_mortality_diag_labs`)'
                .format(**sub_dict)).result().to_dataframe()

df2 = bq.query('SELECT * FROM ML.ROC_CURVE('
                'MODEL `{ml_table_prefix}mortality_feature_top_labs_128`)'
                .format(m, **sub_dict)).result().to_dataframe()

df3 = bq.query('SELECT * FROM ML.ROC_CURVE('
                'MODEL `{ml_table_prefix}mortality_feature_top_labs_256`)'
                .format(m, **sub_dict)).result().to_dataframe()                

set_precision(df1)
plot_precision_recall(df1, label='256 diagnoses + 128 lab event')

set_precision(df2)
plot_precision_recall(df2, label='128 lab event')

set_precision(df3)
plot_precision_recall(df3, label='256 lab event')

# plt.plot(
#     np.linspace(0, 1, 2), [df2.precision.min()] * 2,
#     label='null model',
#     linestyle='--')
plt.legend()
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel(r'Recall $\left(\frac{T_p}{T_p + F_n} \right)$')
plt.ylabel(r'Precision $\left(\frac{T_p}{T_p + F_p} \right)$')
Out[ ]:
Text(0, 0.5, 'Precision $\\left(\\frac{T_p}{T_p + F_p} \\right)$')

As we can see from the graph, adding the top 256 diagnoses actually improves our model.

Lets look on model weights

In [ ]:
%%bigquery weights_128_256
SELECT * FROM ML.WEIGHTS(MODEL `{ml_table_prefix}predict_mortality_diag_labs`)
ORDER BY weight DESC
In [ ]:
weights_128_256.head()
Out[ ]:
processed_input weight category_weights
0 icd9_V667 1.648685 []
1 icd9_V4986 0.918412 []
2 icd9_4275 0.727264 []
3 icd9_3484 0.592067 []
4 icd9_1977 0.501159 []

First we'll look at the weights for the numerical inputs.

In [ ]:
weights_128_256['ITEMID'] = weights_128_256.processed_input \
  .apply(lambda x: x[len('icd9_'):] if x.startswith('icd9_') else x)  
weights_128_256['ITEMID'] = weights_128_256.ITEMID \
  .apply(lambda x: int(x.split('_')[1]) if x.startswith('lab_') else x)
top_diagnoses1=top_diagnoses.rename(columns={'ICD9_CODE': 'ITEMID','SHORT_TITLE':'LABEL','count':'COUNT'})
dff=pd.concat([top_labs,top_diagnoses1])
view_df = weights_128_256.merge(dff,how='left', on='ITEMID')
view_df = view_df[~pd.isnull(view_df.weight)]
view_df[['processed_input', 'LABEL', 'weight', 'COUNT']]
Out[ ]:
processed_input LABEL weight COUNT
0 icd9_V667 Encountr palliative care 1.648685 1020.0
1 icd9_V4986 Do not resusctate status 0.918412 1327.0
2 icd9_4275 Cardiac arrest 0.727264 1361.0
3 icd9_3484 Compression of brain 0.592067 711.0
4 icd9_1977 Second malig neo liver 0.501159 794.0
5 icd9_515 Postinflam pulm fibrosis 0.458844 544.0
6 icd9_570 Acute necrosis of liver 0.431819 1067.0
7 icd9_51884 Acute & chronc resp fail 0.426374 684.0
8 icd9_5570 Ac vasc insuff intestine 0.389654 497.0
9 icd9_431 Intracerebral hemorrhage 0.377781 1367.0
10 icd9_V1011 Hx-bronchogenic malignan 0.375203 519.0
11 icd9_78551 Cardiogenic shock 0.370806 1154.0
12 icd9_42741 Ventricular fibrillation 0.352752 568.0
13 icd9_1970 Secondary malig neo lung 0.325720 754.0
14 icd9_1983 Sec mal neo brain/spine 0.320006 767.0
15 icd9_5781 Blood in stool 0.309218 690.0
16 icd9_78559 Shock w/o trauma NEC 0.307463 535.0
17 icd9_0389 Septicemia NOS 0.302748 3725.0
18 icd9_99592 Severe sepsis 0.296168 3912.0
19 icd9_5789 Gastrointest hemorr NOS 0.295692 1397.0
20 icd9_43491 Crbl art ocl NOS w infrc 0.274524 731.0
21 lab_51218 Granulocyte Count 0.268938 24375.0
22 lab_51257 Nucleated Red Cells 0.260585 16568.0
23 icd9_78959 Ascites NEC 0.251548 978.0
24 icd9_1985 Secondary malig neo bone 0.250302 843.0
25 icd9_78552 Septic shock 0.244396 2586.0
26 icd9_5712 Alcohol cirrhosis liver 0.233369 1202.0
27 icd9_5722 Hepatic encephalopathy 0.231921 666.0
28 icd9_E8889 Fall NOS 0.225236 997.0
29 icd9_51881 Acute respiratry failure 0.218681 7497.0
30 icd9_9982 Accidental op laceration 0.212922 929.0
31 icd9_29410 Dementia w/o behav dist 0.211653 613.0
32 icd9_2762 Acidosis 0.202823 4528.0
33 icd9_430 Subarachnoid hemorrhage 0.201034 658.0
34 icd9_E9331 Adv eff antineoplastic 0.199364 559.0
35 icd9_5723 Portal hypertension 0.191338 1007.0
36 icd9_2869 Coagulat defect NEC/NOS 0.189693 1016.0
37 icd9_43411 Crbl emblsm w infrct 0.187838 642.0
38 icd9_2867 Acq coagul factor defic 0.184710 466.0
39 icd9_03811 Meth susc Staph aur sept 0.183943 515.0
40 icd9_34830 Encephalopathy NOS 0.179766 674.0
41 icd9_5715 Cirrhosis of liver NOS 0.176079 1287.0
42 icd9_V103 Hx of breast malignancy 0.174465 1277.0
43 icd9_5845 Ac kidny fail, tubr necr 0.170546 2287.0
44 icd9_4928 Emphysema NEC 0.161913 641.0
45 icd9_3485 Cerebral edema 0.161549 978.0
46 lab_50909 Cortisol 0.157406 13554.0
47 icd9_V440 Tracheostomy status 0.152329 580.0
48 icd9_V707 Exam-clincal trial 0.147432 530.0
49 lab_50801 Alveolar-arterial Gradient 0.146081 22016.0
50 lab_50823 Required O2 0.144661 22057.0
51 icd9_5849 Acute kidney failure NOS 0.142057 9119.0
52 icd9_03842 E coli septicemia 0.139614 467.0
53 lab_50908 CK-MB Index 0.138029 22911.0
54 lab_51007 Uric Acid 0.136329 18355.0
55 lab_51214 Fibrinogen, Functional 0.133509 45935.0
56 icd9_4271 Parox ventric tachycard 0.133297 1811.0
57 icd9_2767 Hyperpotassemia 0.133038 2169.0
58 icd9_V4581 Aortocoronary bypass 0.132002 3056.0
59 lab_50825 Temperature 0.131626 129444.0
60 lab_50819 PEEP 0.124302 86911.0
61 icd9_2764 Mixed acid-base bal dis 0.122942 536.0
62 lab_50813 Lactate 0.120731 187116.0
63 icd9_V4501 Status cardiac pacemaker 0.120606 1390.0
64 lab_50802 Base Excess 0.118879 490651.0
65 lab_50804 Calculated Total CO2 0.118247 490641.0
66 lab_50818 pCO2 0.117099 490594.0
67 lab_50821 pO2 0.116713 490628.0
68 icd9_E8859 Fall from slipping NEC 0.115969 704.0
69 icd9_3314 Obstructiv hydrocephalus 0.115623 576.0
70 icd9_V153 Hx of irradiation 0.114857 695.0
71 icd9_42731 Atrial fibrillation 0.113709 12891.0
72 lab_50820 pH 0.112324 530752.0
73 icd9_40391 Hyp kid NOS w cr kid V 0.107330 2630.0
74 icd9_3310 Alzheimer's disease 0.101231 570.0
75 lab_50826 Tidal Volume 0.100016 83805.0
76 lab_50967 Phenytoin 0.098054 22453.0
77 icd9_41400 Cor ath unsp vsl ntv/gft 0.096524 1494.0
78 icd9_V422 Heart valve transplant 0.093209 476.0
79 lab_50883 Bilirubin, Direct 0.087005 45205.0
80 icd9_51882 Other pulmonary insuff 0.086214 761.0
81 lab_51275 PTT 0.085452 474930.0
82 icd9_25040 DMII renl nt st uncntrld 0.084709 954.0
83 lab_50816 Oxygen 0.083754 127960.0
84 lab_50808 Free Calcium 0.083463 249110.0
85 lab_51237 INR(PT) 0.083384 471176.0
86 lab_51274 PT 0.083377 469083.0
87 icd9_V462 Depend-supplement oxygen 0.083145 459.0
88 lab_51104 Urea Nitrogen, Urine 0.081175 13294.0
89 lab_50884 Bilirubin, Indirect 0.080390 44385.0
90 icd9_486 Pneumonia, organism NOS 0.080199 4839.0
91 icd9_2536 Neurohypophysis dis NEC 0.079144 505.0
92 icd9_E8809 Fall on stair/step NEC 0.078636 545.0
93 icd9_5856 End stage renal disease 0.077483 1926.0
94 lab_51260 Ovalocytes 0.075962 16180.0
95 icd9_79902 Hypoxemia 0.075034 1298.0
96 lab_50910 Creatine Kinase (CK) 0.074824 132400.0
97 lab_50912 Creatinine 0.074458 797389.0
98 lab_51006 Urea Nitrogen 0.073233 791838.0
99 lab_50817 Oxygen Saturation 0.073134 173418.0
100 icd9_25060 DMII neuro nt st uncntrl 0.071798 1138.0
101 icd9_2761 Hyposmolality 0.069153 3039.0
102 icd9_3320 Paralysis agitans 0.069063 506.0
103 icd9_34290 Unsp hemiplga unspf side 0.068790 478.0
104 icd9_2948 Mental disor NEC oth dis 0.068755 1126.0
105 icd9_V1005 Hx of colonic malignancy 0.068439 771.0
106 lab_50935 Haptoglobin 0.068298 11970.0
107 lab_51482 Hyaline Casts 0.067937 15118.0
108 icd9_41071 Subendo infarct, initial 0.067752 3055.0
109 lab_50911 Creatine Kinase, MB Isoenzyme 0.066253 115802.0
110 lab_50954 Lactate Dehydrogenase (LD) 0.062514 107120.0
111 lab_50862 Albumin 0.062398 146694.0
112 icd9_07054 Chrnc hpt C wo hpat coma 0.062317 1218.0
113 lab_51144 Bands 0.061476 75587.0
114 lab_50827 Ventilation Rate 0.061469 77611.0
115 lab_51266 Platelet Smear 0.058566 44151.0
116 lab_51093 Osmolality, Urine 0.057678 18838.0
117 icd9_4280 CHF NOS 0.057389 13111.0
118 lab_51082 Creatinine, Urine 0.056012 33062.0
119 lab_51143 Atypical Lymphocytes 0.052380 59334.0
120 lab_50964 Osmolality, Measured 0.051211 18816.0
121 icd9_V4364 Joint replaced hip 0.050768 800.0
122 lab_51251 Metamyelocytes 0.049858 59206.0
123 lab_51255 Myelocytes 0.048525 59158.0
124 lab_51100 Sodium, Urine 0.047805 25990.0
125 icd9_77089 Resp prob after brth NEC 0.043817 583.0
126 icd9_4589 Hypotension NOS 0.042985 2051.0
127 lab_50812 Intubated 0.040849 163680.0
128 icd9_43310 Ocl crtd art wo infrct 0.039402 833.0
129 icd9_V4365 Joint replaced knee 0.039253 816.0
130 lab_50878 Asparate Aminotransferase (AST) 0.039195 219452.0
131 lab_50861 Alanine Aminotransferase (ALT) 0.038757 219462.0
132 icd9_5859 Chronic kidney dis NOS 0.038690 3435.0
133 icd9_5070 Food/vomit pneumonitis 0.037701 3680.0
134 lab_50800 SPECIMEN TYPE 0.036984 404785.0
135 icd9_4439 Periph vascular dis NOS 0.033583 1401.0
136 icd9_2875 Thrombocytopenia NOS 0.033004 3065.0
137 lab_51003 Troponin T 0.032014 87238.0
138 lab_51000 Triglycerides 0.031632 24987.0
139 lab_51009 Vancomycin 0.031236 54728.0
140 lab_50885 Bilirubin, Total 0.030044 238263.0
141 lab_50931 Glucose 0.029493 748896.0
142 icd9_V4502 Status autm crd dfbrltr 0.029369 733.0
143 lab_50956 Lipase 0.029307 65373.0
144 icd9_43889 Late effect CV dis NEC 0.027783 465.0
145 lab_50867 Amylase 0.026748 63664.0
146 icd9_30391 Alcoh dep NEC/NOS-contin 0.025788 719.0
147 lab_50933 Green Top Hold (plasma) 0.025258 26855.0
148 icd9_99662 React-oth vasc dev/graft 0.023245 950.0
149 icd9_25000 DMII wo cmp nt st uncntr 0.021549 9058.0
150 lab_50863 Alkaline Phosphatase 0.020419 207847.0
151 lab_51267 Poikilocytosis 0.018900 55886.0
152 lab_51246 Macrocytes 0.018542 60837.0
153 lab_50822 Potassium, Whole Blood 0.017942 192949.0
154 lab_51137 Anisocytosis 0.017827 61381.0
155 icd9_2749 Gout NOS 0.017077 2082.0
156 lab_51233 Hypochromia 0.016848 61634.0
157 icd9_496 Chr airway obstruct NEC 0.016281 4431.0
158 lab_51200 Eosinophils 0.015145 172126.0
159 lab_51146 Basophils 0.015145 172124.0
160 lab_51244 Lymphocytes 0.015132 172131.0
161 lab_51256 Neutrophils 0.015132 172131.0
162 lab_51254 Monocytes 0.015132 172130.0
163 lab_51268 Polychromasia 0.014452 47280.0
164 lab_51508 Urine Color 0.013987 100377.0
165 icd9_5121 Iatrogenic pneumothorax 0.013132 786.0
166 icd9_41519 Pulm embol/infarct NEC 0.011491 859.0
167 lab_50809 Glucose 0.010573 196736.0
168 icd9_1120 Thrush 0.010108 646.0
169 icd9_99674 Comp-oth vasc dev/graft 0.009896 576.0
170 lab_51252 Microcytes 0.009313 58059.0
171 lab_51476 Epithelial Cells 0.008494 83200.0
172 icd9_4168 Chr pulmon heart dis NEC 0.008092 2148.0
173 lab_50887 Blue Top Hold 0.006859 21880.0
174 age NaN 0.006148 NaN
175 lab_51493 RBC 0.005752 84278.0
176 lab_51087 Length of Urine Collection 0.005620 46834.0
177 num_diag NaN 0.002170 NaN
178 icd9_73300 Osteoporosis NOS 0.002108 1947.0
179 icd9_27651 Dehydration 0.001418 1385.0
180 lab_51516 WBC 0.001375 84682.0
181 icd9_E8798 Abn react-procedure NEC 0.000687 1502.0
182 icd9_41041 AMI inferior wall, init 0.000454 573.0
183 COUNT_LABS NaN -0.000011 NaN
184 icd9_42830 Diastolc hrt failure NOS -0.000723 800.0
185 lab_51478 Glucose -0.000744 101280.0
186 icd9_V4589 Post-proc states NEC -0.001537 496.0
187 lab_51492 Protein -0.001944 106412.0
188 lab_51463 Bacteria -0.005715 73412.0
189 icd9_60000 BPH w/o urinary obs/LUTS -0.005740 1490.0
190 icd9_27652 Hypovolemia -0.006209 1374.0
191 lab_51506 Urine Appearance -0.006661 99838.0
192 lab_51519 Yeast -0.007601 85457.0
193 icd9_5853 Chr kidney dis stage III -0.008084 735.0
194 icd9_99811 Hemorrhage complic proc -0.008353 1535.0
195 icd9_3572 Neuropathy in diabetes -0.010348 1584.0
196 icd9_V1582 History of tobacco use -0.011921 2811.0
197 lab_51464 Bilirubin -0.013733 99680.0
198 lab_51487 Nitrite -0.014305 99675.0
199 icd9_3970 Tricuspid valve disease -0.014991 779.0
200 lab_51466 Blood -0.015835 99726.0
201 icd9_2760 Hyperosmolality -0.016032 2272.0
202 icd9_V5865 Long-term use steroids -0.016695 893.0
203 icd9_6826 Cellulitis of leg -0.017141 732.0
204 lab_51514 Urobilinogen -0.018230 101817.0
205 icd9_042 Human immuno virus dis -0.018456 538.0
206 icd9_V1251 Hx-ven thrombosis/embols -0.018663 1612.0
207 icd9_2449 Hypothyroidism NOS -0.019310 4917.0
208 icd9_28860 Leukocytosis NOS -0.021323 856.0
209 icd9_7140 Rheumatoid arthritis -0.022345 649.0
210 icd9_99672 Comp-oth cardiac device -0.022439 473.0
211 lab_50828 Ventilator -0.022918 60544.0
212 lab_51486 Leukocytes -0.023015 97625.0
213 lab_51484 Ketone -0.024880 101624.0
214 icd9_4148 Chr ischemic hrt dis NEC -0.025142 865.0
215 icd9_40390 Hy kid NOS w cr kid I-IV -0.025895 3421.0
216 icd9_51919 Trachea & bronch dis NEC -0.028464 534.0
217 icd9_99591 Sepsis -0.028695 1272.0
218 icd9_34590 Epilep NOS w/o intr epil -0.028848 977.0
219 icd9_2768 Hypopotassemia -0.029153 1425.0
220 icd9_49121 Obs chr bronc w(ac) exac -0.029477 1198.0
221 icd9_4412 Thoracic aortic aneurysm -0.029785 490.0
222 icd9_V4582 Status-post ptca -0.029966 2725.0
223 lab_50979 Red Top Hold -0.033042 23831.0
224 lab_50806 Chloride, Whole Blood -0.034943 48188.0
225 icd9_04104 Enterococcus group d -0.036064 614.0
226 lab_50905 Cholesterol, LDL, Calculated -0.036213 16486.0
227 lab_50815 O2 Flow -0.038934 12307.0
228 icd9_49320 Chronic obst asthma NOS -0.039116 752.0
229 icd9_5119 Pleural effusion NOS -0.040198 2734.0
230 lab_50960 Magnesium -0.040415 664123.0
231 icd9_42789 Cardiac dysrhythmias NEC -0.042718 2453.0
232 lab_50920 Estimated GFR (MDRD equation) -0.042963 63253.0
233 lab_51512 Urine Mucous -0.043338 14204.0
234 icd9_2800 Chr blood loss anemia -0.044199 1012.0
235 icd9_7242 Lumbago -0.044518 487.0
236 lab_50824 Sodium, Whole Blood -0.045934 71504.0
237 icd9_00845 Int inf clstrdium dfcile -0.047405 1444.0
238 icd9_7746 Fetal/neonatal jaund NOS -0.049047 659.0
239 lab_50907 Cholesterol, Total -0.049176 19444.0
240 lab_50904 Cholesterol, HDL -0.049557 18260.0
241 icd9_99731 Ventltr assoc pneumonia -0.049897 564.0
242 icd9_V1046 Hx-prostatic malignancy -0.051452 1207.0
243 icd9_5363 Gastroparesis -0.052633 509.0
244 lab_50903 Cholesterol Ratio (Total/HDL) -0.052949 18128.0
245 icd9_E9342 Adv eff anticoagulants -0.054420 685.0
246 lab_50993 Thyroid Stimulating Hormone -0.055052 28220.0
247 lab_50810 Hematocrit, Calculated -0.055517 89715.0
248 lab_50811 Hemoglobin -0.055517 89712.0
249 icd9_27801 Morbid obesity -0.056759 1170.0
250 icd9_2720 Pure hypercholesterolem -0.057467 5930.0
251 icd9_E8781 Abn react-artif implant -0.058560 707.0
252 icd9_9971 Surg compl-heart -0.060127 2343.0
253 icd9_53081 Esophageal reflux -0.061438 6326.0
254 lab_51498 Specific Gravity -0.062437 115858.0
255 lab_50970 Phosphate -0.063034 590502.0
256 lab_51491 pH -0.063225 116081.0
257 icd9_412 Old myocardial infarct -0.063324 3278.0
258 icd9_78039 Convulsions NEC -0.063607 1934.0
259 icd9_41011 AMI anterior wall, init -0.063762 544.0
260 icd9_4260 Atriovent block complete -0.064757 530.0
261 icd9_4019 Hypertension NOS -0.064967 20703.0
262 icd9_5601 Paralytic ileus -0.066627 1072.0
263 lab_50893 Calcium, Total -0.069920 591932.0
264 icd9_V5861 Long-term use anticoagul -0.071662 3806.0
265 icd9_2851 Ac posthemorrhag anemia -0.074864 4552.0
266 icd9_30500 Alcohol abuse-unspec -0.075109 977.0
267 icd9_V3101 Twin-mate lb-in hos w cs -0.077026 1000.0
268 icd9_36201 Diabetic retinopathy NOS -0.077623 875.0
269 icd9_5990 Urin tract infection NOS -0.077688 6555.0
270 icd9_E8497 Accid in resident instit -0.078219 1039.0
271 icd9_70703 Pressure ulcer, low back -0.078963 1289.0
272 icd9_V1254 Hx TIA/stroke w/o resid -0.080135 1072.0
273 icd9_07070 Hpt C w/o hepat coma NOS -0.083834 674.0
274 lab_50952 Iron -0.085607 18205.0
275 lab_50902 Chloride -0.086010 795480.0
276 icd9_5770 Acute pancreatitis -0.086457 961.0
277 icd9_42823 Ac on chr syst hrt fail -0.086696 1143.0
278 icd9_769 Respiratory distress syn -0.089233 1314.0
279 lab_50971 Potassium -0.089267 845737.0
280 lab_50983 Sodium -0.089541 808401.0
281 icd9_2724 Hyperlipidemia NEC/NOS -0.090139 8690.0
282 lab_50882 Bicarbonate -0.090559 780648.0
283 icd9_04111 Mth sus Stph aur els/NOS -0.092538 703.0
284 icd9_E9320 Adv eff corticosteroids -0.092666 547.0
285 icd9_42832 Chr diastolic hrt fail -0.093942 1240.0
286 icd9_78057 Sleep apnea NOS -0.095103 581.0
287 icd9_V3000 Single lb in-hosp w/o cs -0.095799 3566.0
288 icd9_28529 Anemia-other chronic dis -0.097636 1093.0
289 icd9_56210 Dvrtclo colon w/o hmrhg -0.098664 849.0
290 icd9_42822 Chr systolic hrt failure -0.099858 1097.0
291 icd9_3659 Glaucoma NOS -0.099862 711.0
292 icd9_V1083 Hx-skin malignancy NEC -0.100089 506.0
293 icd9_E8788 Abn react-surg proc NEC -0.100430 1350.0
294 icd9_4241 Aortic valve disorder -0.100455 2550.0
295 icd9_4139 Angina pectoris NEC/NOS -0.100492 1100.0
296 icd9_5939 Renal & ureteral dis NOS -0.100615 679.0
297 lab_50998 Transferrin -0.101000 17283.0
298 lab_50868 Anion Gap -0.102072 769810.0
299 icd9_3051 Tobacco use disorder -0.104205 3358.0
300 lab_50953 Iron Binding Capacity, Total -0.104653 17229.0
301 icd9_4254 Prim cardiomyopathy NEC -0.105459 1709.0
302 icd9_76518 Preterm NEC 2000-2499g -0.105670 924.0
303 icd9_V290 NB obsrv suspct infect -0.106214 5519.0
304 icd9_V3001 Single lb in-hosp w cs -0.106393 2758.0
305 icd9_4414 Abdom aortic aneurysm -0.107277 660.0
306 icd9_41401 Crnry athrscl natve vssl -0.108420 12429.0
307 icd9_E8790 Abn react-cardiac cath -0.109815 463.0
308 icd9_7907 Bacteremia -0.110384 1478.0
309 lab_50924 Ferritin -0.110444 17946.0
310 icd9_76517 Preterm NEC 1750-1999g -0.110524 474.0
311 icd9_7742 Neonat jaund preterm del -0.110989 2264.0
312 icd9_2639 Protein-cal malnutr NOS -0.111555 1259.0
313 icd9_V173 Fam hx-ischem heart dis -0.113129 703.0
314 icd9_42833 Ac on chr diast hrt fail -0.113228 1220.0
315 icd9_4240 Mitral valve disorder -0.115820 2926.0
316 lab_51301 White Blood Cells -0.116388 753221.0
317 icd9_30390 Alcoh dep NEC/NOS-unspec -0.117736 597.0
318 icd9_V5867 Long-term use of insulin -0.118644 2538.0
319 icd9_71590 Osteoarthros NOS-unspec -0.118893 891.0
320 icd9_30000 Anxiety state NOS -0.120839 1580.0
321 icd9_76528 35-36 comp wks gestation -0.121696 874.0
322 icd9_76519 Preterm NEC 2500+g -0.122264 744.0
323 icd9_76526 31-32 comp wks gestation -0.123666 487.0
324 icd9_V441 Gastrostomy status -0.124125 522.0
325 icd9_V4511 Renal dialysis status -0.125288 594.0
326 icd9_V433 Heart valve replac NEC -0.127831 679.0
327 icd9_42732 Atrial flutter -0.128148 1217.0
328 icd9_58381 Nephritis NOS in oth dis -0.129333 594.0
329 icd9_7843 Aphasia -0.129447 488.0
330 icd9_E8782 Abn react-anastom/graft -0.132178 1211.0
331 icd9_99681 Compl kidney transplant -0.133066 515.0
332 icd9_V090 Inf mcrg rstn pncllins -0.133671 658.0
333 icd9_29680 Bipolar disorder NOS -0.135089 603.0
334 icd9_2763 Alkalosis -0.135476 984.0
335 lab_51277 RDW -0.136563 746817.0
336 icd9_5761 Cholangitis -0.138026 662.0
337 icd9_27800 Obesity NOS -0.138170 1511.0
338 icd9_51889 Other lung disease NEC -0.138792 694.0
339 icd9_99812 Hematoma complic proc -0.138890 1224.0
340 icd9_2859 Anemia NOS -0.139121 5406.0
341 icd9_76527 33-34 comp wks gestation -0.140278 881.0
342 icd9_311 Depressive disorder NEC -0.140404 3431.0
343 icd9_7850 Tachycardia NOS -0.143838 607.0
344 icd9_45981 Venous insufficiency NOS -0.146923 503.0
345 icd9_45829 Iatrogenc hypotnsion NEC -0.148081 2121.0
346 icd9_5180 Pulmonary collapse -0.148402 2165.0
347 icd9_78009 Other alter consciousnes -0.148579 577.0
348 lab_50986 tacroFK -0.148825 21630.0
349 icd9_7706 NB transitory tachypnea -0.150221 754.0
350 icd9_3004 Dysthymic disorder -0.150379 1109.0
351 icd9_42821 Ac systolic hrt failure -0.152728 492.0
352 lab_51279 Red Blood Cells -0.154217 747999.0
353 lab_51248 MCH -0.154217 747994.0
354 lab_51250 MCV -0.154217 747977.0
355 icd9_5533 Diaphragmatic hernia -0.156420 663.0
356 lab_51249 MCHC -0.156459 748147.0
357 icd9_33829 Chronic pain NEC -0.157243 692.0
358 icd9_41402 Crn ath atlg vn bps grft -0.158206 474.0
359 icd9_V053 Need prphyl vc vrl hepat -0.160477 5779.0
360 lab_51222 Hemoglobin -0.160518 752444.0
361 icd9_34982 Toxic encephalopathy -0.162165 643.0
362 lab_51265 Platelet Count -0.162245 778365.0
363 lab_50852 % Hemoglobin A1c -0.163736 16618.0
364 icd9_77981 Neonatal bradycardia -0.164706 606.0
365 icd9_7455 Secundum atrial sept def -0.167146 668.0
366 icd9_28521 Anemia in chr kidney dis -0.171488 1384.0
367 icd9_29181 Alcohol withdrawal -0.173070 812.0
368 icd9_49390 Asthma NOS -0.174836 2195.0
369 icd9_99859 Other postop infection -0.178549 1186.0
370 icd9_V502 Routine circumcision -0.178908 2016.0
371 icd9_32723 Obstructive sleep apnea -0.188000 2380.0
372 icd9_6930 Drug dermatitis NOS -0.191982 543.0
373 icd9_56400 Constipation NOS -0.199376 830.0
374 icd9_79092 Abnrml coagultion prfile -0.201179 994.0
375 icd9_2809 Iron defic anemia NOS -0.202285 1171.0
376 icd9_V1581 Hx of past noncompliance -0.211387 643.0
377 icd9_4111 Intermed coronary synd -0.213257 1663.0
378 icd9_48241 Meth sus pneum d/t Staph -0.213785 789.0
379 icd9_77081 Primary apnea of newborn -0.243873 1043.0
380 lab_51221 Hematocrit -0.245935 881764.0
381 icd9_2930 Delirium d/t other cond -0.248093 1445.0
382 icd9_78820 Retention urine NOS -0.255220 1020.0
383 icd9_78791 Diarrhea -0.255940 1049.0
384 icd9_0413 Klebsiella pneumoniae -0.295880 473.0
385 icd9_78720 Dysphagia NOS -0.325711 499.0
386 icd9_34690 Migrne unsp wo ntrc mgrn -0.343134 528.0
387 __INTERCEPT__ NaN -0.937182 NaN

We see have a list of diagnoses and lab events, sorted from most fatal to least fatal according to our model.

Going back to our original question, we can see that the weight for num_diag (a.k.a the number of diagnoses) has essentially gone to zero (0.002395) as well as the weight for COUNT_LABS (a.k.a the number of lab events) is very small (-0.000011). The average diagnoses weight and lab event is also very small:

In [ ]:
view_df[~pd.isnull(view_df.LABEL)].weight.mean()
Out[ ]:
0.010698212781014632

Nonetheless, the graph shows that the first 20 lines are occupied by diagnoses, namely they have more weight. So we can conclude that diagnoses provide a better prediction than a lab event

Social determinant weights

In [ ]:
df_new=weights_128_256[pd.isnull(weights_128_256['weight'])]
title=weights_128_256[pd.isnull(weights_128_256['weight'])]['processed_input'].to_list()
s=0
for i in df_new['category_weights']:
  print(title[s])
  for x in i:
    print(x['category'],x['weight'])
  s+=1
  print("____________\n")  
ADMISSION_TYPE
NEWBORN -0.36810775387185873
ELECTIVE -0.40683197692928613
URGENT -0.33409536662646383
EMERGENCY -0.2701287519528017
____________

ADMISSION_LOCATION
HMO REFERRAL/SICK -0.10259030931933484
EMERGENCY ROOM ADMIT -0.2643567527009267
PHYS REFERRAL/NORMAL DELI -0.3435315099790993
TRSF WITHIN THIS FACILITY -0.34590625812923276
CLINIC REFERRAL/PREMATURE -0.3012130931299697
TRANSFER FROM SKILLED NUR -0.10978820484464917
TRANSFER FROM HOSP/EXTRAM -0.30747637706902115
TRANSFER FROM OTHER HEALT 0.25916254990615106
** INFO NOT AVAILABLE ** -0.32087245589246244
____________

INSURANCE
Medicare -0.22028550766516702
Private -0.3752668741003606
Self Pay -0.012848172094661934
Government -0.4056960954394789
Medicaid -0.401392460905443
____________

MARITAL_STATUS
SEPARATED -0.3793807062856737
DIVORCED -0.37973753232901925
WIDOWED -0.2063564938829422
LIFE PARTNER -0.7430609709474453
UNKNOWN (DEFAULT) -0.2669391115576381
MARRIED -0.28344501638624664
_null_filler -0.09398459312517879
SINGLE -0.3848338936398609
____________

ETHNICITY
WHITE - EASTERN EUROPEAN -0.6550293100264336
HISPANIC/LATINO - GUATEMALAN -0.5301404397977171
ASIAN - JAPANESE 0.8713440325247243
MULTI RACE ETHNICITY -0.5812604886403921
HISPANIC/LATINO - HONDURAN -0.6315139513116651
UNABLE TO OBTAIN 0.04096706506924721
UNKNOWN/NOT SPECIFIED -0.040436614633734595
NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER 0.35166125175273244
ASIAN - THAI -0.5115160929017661
ASIAN - OTHER -0.5363976699628764
AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE 0.7996210539869746
OTHER -0.3483401426943742
PORTUGUESE -0.2736231676983852
WHITE - RUSSIAN -0.32877657248299974
HISPANIC/LATINO - COLOMBIAN -0.6951390152326026
SOUTH AMERICAN -0.45532205420622945
ASIAN - CAMBODIAN 0.22315609916571705
BLACK/CAPE VERDEAN -0.4962740449376235
ASIAN - FILIPINO -0.46785532874146823
HISPANIC/LATINO - SALVADORAN -0.5159388639823188
PATIENT DECLINED TO ANSWER -0.26975372484189925
WHITE - OTHER EUROPEAN -0.034776538102751386
HISPANIC/LATINO - CENTRAL AMERICAN (OTHER) -0.5680714501014497
HISPANIC OR LATINO -0.4514327957285973
HISPANIC/LATINO - DOMINICAN -0.6209761839497282
HISPANIC/LATINO - PUERTO RICAN -0.46286978967259773
WHITE -0.30457789248074146
BLACK/HAITIAN -0.017417513897143168
HISPANIC/LATINO - MEXICAN -0.5923155669904455
AMERICAN INDIAN/ALASKA NATIVE -0.23855027915080568
BLACK/AFRICAN AMERICAN -0.388912832156278
MIDDLE EASTERN -0.7984226829534431
ASIAN - KOREAN -0.6637847744698869
HISPANIC/LATINO - CUBAN -0.7436555236442132
ASIAN -0.3064883896055186
ASIAN - ASIAN INDIAN -0.7996618065316163
ASIAN - CHINESE -0.24028636822206223
WHITE - BRAZILIAN -0.5413316872726444
ASIAN - VIETNAMESE -0.369985176135227
BLACK/AFRICAN -0.7751818032076114
CARIBBEAN ISLAND -0.5774706621890164
____________

GENDER
M -0.2939957053370493
F -0.2885942657491509
____________

Bonus: Classification model - 5 - by number of labs, gender, age, admission type, admission location, insurance, marital status, and ethnicity but limit the data that was collected in the first 24 hours of admission

This time we will make a prediction based on the lab events done in the first 24 hours of admission

Example table for train

In [ ]:
%%bigquery
WITH summary AS (
SELECT * FROM
(SELECT SUBJECT_ID,HADM_ID,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,
MARITAL_STATUS,ETHNICITY,GENDER,HOSPITAL_EXPIRE_FLAG as dead,
IF(DATETIME_DIFF(CHARTTIME, ADMITTIME, HOUR)<= 24,
       1, 0) AS TIME,
IF(DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25 < 200,
       DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25, 95) AS AGE,       
FROM
( 
`{admissions_table}` JOIN 
`{labs_event}`
USING (HADM_ID,SUBJECT_ID)
JOIN
`{patients_table}`
USING (SUBJECT_ID)))
WHERE TIME=1)

SELECT *EXCEPT(HADM_ID,SUBJECT_ID,TIME) FROM
summary
JOIN
(SELECT count(*) as Num_labs,HADM_ID
From summary
GROUP BY HADM_ID)
USING (HADM_ID)
LIMIT 50
Out[ ]:
ADMISSION_TYPE ADMISSION_LOCATION INSURANCE MARITAL_STATUS ETHNICITY GENDER dead AGE Num_labs
0 EMERGENCY EMERGENCY ROOM ADMIT Medicaid None UNKNOWN/NOT SPECIFIED M 1 41.787817 154
1 ELECTIVE PHYS REFERRAL/NORMAL DELI Medicaid None WHITE M 0 45.511294 104
2 URGENT TRANSFER FROM HOSP/EXTRAM Medicare MARRIED UNKNOWN/NOT SPECIFIED F 0 65.538672 51
3 EMERGENCY EMERGENCY ROOM ADMIT Medicare MARRIED WHITE F 1 68.958248 171
4 ELECTIVE PHYS REFERRAL/NORMAL DELI Private MARRIED WHITE M 0 63.134839 430
5 EMERGENCY EMERGENCY ROOM ADMIT Medicare WIDOWED WHITE F 0 74.119097 183
6 EMERGENCY EMERGENCY ROOM ADMIT Government DIVORCED HISPANIC/LATINO - DOMINICAN M 0 51.745380 220
7 EMERGENCY TRANSFER FROM HOSP/EXTRAM Private MARRIED WHITE M 0 52.208077 129
8 EMERGENCY EMERGENCY ROOM ADMIT Medicare WIDOWED UNKNOWN/NOT SPECIFIED M 1 64.019165 116
9 EMERGENCY TRANSFER FROM HOSP/EXTRAM Medicare MARRIED WHITE M 0 74.959617 101
10 EMERGENCY EMERGENCY ROOM ADMIT Medicaid SINGLE BLACK/AFRICAN AMERICAN F 0 50.847365 202
11 EMERGENCY EMERGENCY ROOM ADMIT Medicare DIVORCED WHITE M 0 55.449692 155
12 ELECTIVE PHYS REFERRAL/NORMAL DELI Medicare MARRIED WHITE F 0 75.994524 198
13 EMERGENCY EMERGENCY ROOM ADMIT Private SINGLE WHITE F 0 19.293634 278
14 EMERGENCY CLINIC REFERRAL/PREMATURE Private MARRIED WHITE F 0 60.657084 98
15 EMERGENCY CLINIC REFERRAL/PREMATURE Medicare SINGLE HISPANIC OR LATINO M 0 48.793977 68
16 EMERGENCY CLINIC REFERRAL/PREMATURE Medicare SINGLE WHITE M 0 49.694730 186
17 ELECTIVE PHYS REFERRAL/NORMAL DELI Medicare MARRIED UNKNOWN/NOT SPECIFIED M 0 76.722793 149
18 ELECTIVE PHYS REFERRAL/NORMAL DELI Medicare MARRIED WHITE M 0 73.133470 219
19 EMERGENCY EMERGENCY ROOM ADMIT Private MARRIED WHITE M 0 41.486653 61
20 EMERGENCY EMERGENCY ROOM ADMIT Medicare WIDOWED WHITE F 0 73.492129 63
21 EMERGENCY EMERGENCY ROOM ADMIT Medicare MARRIED WHITE M 1 83.383984 159
22 URGENT TRANSFER FROM HOSP/EXTRAM Private MARRIED UNKNOWN/NOT SPECIFIED M 1 82.146475 106
23 ELECTIVE PHYS REFERRAL/NORMAL DELI Medicare MARRIED WHITE M 0 68.205339 87
24 EMERGENCY EMERGENCY ROOM ADMIT Medicare MARRIED WHITE F 1 72.750171 175
25 ELECTIVE PHYS REFERRAL/NORMAL DELI Private MARRIED WHITE M 0 59.364819 161
26 EMERGENCY EMERGENCY ROOM ADMIT Medicare WIDOWED UNKNOWN/NOT SPECIFIED F 0 75.060917 123
27 EMERGENCY TRANSFER FROM HOSP/EXTRAM Medicare MARRIED UNKNOWN/NOT SPECIFIED F 0 71.997262 146
28 EMERGENCY EMERGENCY ROOM ADMIT Private DIVORCED WHITE F 0 63.767283 192
29 EMERGENCY EMERGENCY ROOM ADMIT Medicaid SINGLE WHITE M 1 45.204654 219
30 EMERGENCY CLINIC REFERRAL/PREMATURE Medicare SINGLE WHITE F 0 56.555784 86
31 EMERGENCY EMERGENCY ROOM ADMIT Medicare MARRIED WHITE M 1 85.199179 250
32 EMERGENCY TRANSFER FROM HOSP/EXTRAM Medicaid WIDOWED WHITE F 0 39.307324 94
33 EMERGENCY EMERGENCY ROOM ADMIT Private MARRIED WHITE M 1 57.344285 97
34 EMERGENCY TRANSFER FROM HOSP/EXTRAM Medicaid WIDOWED WHITE F 0 39.307324 94
35 EMERGENCY TRANSFER FROM HOSP/EXTRAM Medicaid WIDOWED WHITE F 0 39.307324 94
36 EMERGENCY TRANSFER FROM HOSP/EXTRAM Private MARRIED UNKNOWN/NOT SPECIFIED M 1 60.870637 334
37 EMERGENCY EMERGENCY ROOM ADMIT Medicare MARRIED WHITE M 1 95.000000 164
38 EMERGENCY TRANSFER FROM OTHER HEALT Medicare MARRIED WHITE M 0 76.602327 246
39 EMERGENCY EMERGENCY ROOM ADMIT Medicare MARRIED BLACK/AFRICAN AMERICAN M 0 67.734428 127
40 EMERGENCY EMERGENCY ROOM ADMIT Government DIVORCED HISPANIC OR LATINO F 0 57.645448 132
41 EMERGENCY PHYS REFERRAL/NORMAL DELI Private MARRIED WHITE M 0 60.878850 325
42 NEWBORN PHYS REFERRAL/NORMAL DELI Private None BLACK/AFRICAN AMERICAN M 0 0.000000 82
43 EMERGENCY EMERGENCY ROOM ADMIT Private SINGLE OTHER M 0 42.464066 439
44 EMERGENCY EMERGENCY ROOM ADMIT Medicare SINGLE WHITE M 0 49.845311 106
45 EMERGENCY EMERGENCY ROOM ADMIT Self Pay None WHITE F 0 19.989049 118
46 EMERGENCY EMERGENCY ROOM ADMIT Medicare MARRIED ASIAN - CHINESE M 0 86.628337 206
47 EMERGENCY EMERGENCY ROOM ADMIT Medicaid SINGLE WHITE F 0 65.207392 119
48 EMERGENCY TRANSFER FROM HOSP/EXTRAM Medicare MARRIED WHITE F 0 71.556468 212
49 EMERGENCY EMERGENCY ROOM ADMIT Medicare WIDOWED WHITE M 0 77.470226 89
In [ ]:
%%bigquery
CREATE OR REPLACE MODEL `{ml_table_prefix}mortality_24_hours`

OPTIONS(
  model_type = 'logistic_reg',
  # See the below aside (𝜎 = 0.5 ⇒ 𝜆 = 2)
  l2_reg = 2,
  input_label_cols = ["died"]
)
AS
# standard SQL query to train the model with:
WITH summary AS (
SELECT * FROM
( 
  SELECT SUBJECT_ID,HADM_ID,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,
MARITAL_STATUS,ETHNICITY,GENDER,HOSPITAL_EXPIRE_FLAG as died,
IF(DATETIME_DIFF(CHARTTIME, ADMITTIME, HOUR)<= 24,
       1, 0) AS TIME,
IF(DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25 < 200,
       DATETIME_DIFF(ADMITTIME, DOB, DAY)/365.25, 95) AS AGE,       
FROM
( 
`{admissions_table}` JOIN 
`{labs_event}`
USING (HADM_ID,SUBJECT_ID)
JOIN
`{patients_table}`
USING (SUBJECT_ID))
)
WHERE TIME=1)

SELECT *EXCEPT(HADM_ID,SUBJECT_ID,TIME) FROM
summary
JOIN
(SELECT count(*) as Num_labs,HADM_ID
From summary
GROUP BY HADM_ID)
USING (HADM_ID)
Out[ ]:
In [ ]:
eval_queries = list()
eval_queries.append(
      'SELECT * FROM ML.EVALUATE('
      'MODEL `{ml_table_prefix}mortality_24_hours`)'
      .format( **sub_dict))
eval_query = '\nUNION ALL\n'.join(eval_queries)
bq.query(eval_query).result().to_dataframe()
Out[ ]:
precision recall accuracy f1_score log_loss roc_auc
0 0.659091 0.044073 0.872702 0.082621 0.339861 0.74688
In [ ]:
%%bigquery weights_24
SELECT * FROM ML.WEIGHTS(MODEL `{ml_table_prefix}mortality_24_hours`)
ORDER BY weight DESC
In [ ]:
weights_24
Out[ ]:
processed_input weight category_weights
0 AGE 0.014166 []
1 Num_labs 0.006802 []
2 __INTERCEPT__ -2.079611 []
3 ADMISSION_TYPE NaN [{'category': 'EMERGENCY', 'weight': -0.199193...
4 ADMISSION_LOCATION NaN [{'category': 'HMO REFERRAL/SICK', 'weight': 0...
5 INSURANCE NaN [{'category': 'Medicaid', 'weight': -0.5151097...
6 MARITAL_STATUS NaN [{'category': 'SINGLE', 'weight': -0.510721995...
7 ETHNICITY NaN [{'category': 'HISPANIC/LATINO - SALVADORAN', ...
8 GENDER NaN [{'category': 'F', 'weight': -0.33209721014908...

We see that the Num_labs weight really increased from 0.000508 to 0.006802.But how much will this affect on predictive power?

In [ ]:
df1 = bq.query('SELECT * FROM ML.ROC_CURVE('
                'MODEL `{ml_table_prefix}mortality_24_hours`)'
                .format(**sub_dict)).result().to_dataframe()

df2 = bq.query('SELECT * FROM ML.ROC_CURVE('
                'MODEL `{ml_table_prefix}complexity_feature_mortality`)'
                .format(m, **sub_dict)).result().to_dataframe()

df3 = bq.query('SELECT * FROM ML.ROC_CURVE('
                'MODEL `{ml_table_prefix}predict_mortality_diag_labs`)'
                .format(m, **sub_dict)).result().to_dataframe()                

df4 = bq.query('SELECT * FROM ML.ROC_CURVE('
                'MODEL `{ml_table_prefix}mortality_feature_top_labs_128`)'
                .format(m, **sub_dict)).result().to_dataframe()

set_precision(df1)
plot_precision_recall(df1, label='lab event in the first 24 hours')

set_precision(df2)
plot_precision_recall(df2, label='all numbers of lab events')

set_precision(df3)
plot_precision_recall(df3, label='256 diagnoses + 128 lab event')

set_precision(df4)
plot_precision_recall(df4, label='top 128 lab event')

# plt.plot(
#     np.linspace(0, 1, 2), [df2.precision.min()] * 2,
#     label='null model',
#     linestyle='--')
plt.legend()
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel(r'Recall $\left(\frac{T_p}{T_p + F_n} \right)$')
plt.ylabel(r'Precision $\left(\frac{T_p}{T_p + F_p} \right)$')
Out[ ]:
Text(0, 0.5, 'Precision $\\left(\\frac{T_p}{T_p + F_p} \\right)$')

We see that considering only the LAB EVENTS in the first 24 hours allows us to improve our BASIC model (only LAB EVENTS). As further work, we can consider the one hot encoding of top diagnoses and top lab events assigned also in the first 24 hours. Nevertheless, we see that the use of diagnoses and labs event allows us to create a serious model (ROC 0.88).